Preparation¶

InĀ [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.append('..')
from helper import get_latest_table
InĀ [2]:
# Take a single timestamp so the month and year always describe the same
# instant; two separate now() calls could straddle a month/year boundary.
run_timestamp = pd.Timestamp.now()
current_month = run_timestamp.month
current_year = run_timestamp.year

# Fetch the tables analysed below through the project helper.
cpu_data = get_latest_table('cpu_specs')
gpu_data = get_latest_table('gpu_specs')

full_relation = get_latest_table('full_relation')
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful

Preview the data

InĀ [3]:
# Report the dimensions of every loaded table, one line per table
for table_label, table in (
    ("CPU Data", cpu_data),
    ("GPU Data", gpu_data),
    ("Full Relation Data", full_relation),
):
    row_count, column_count = table.shape
    print(f"{table_label}: {row_count} rows, {column_count} columns")
CPU Data: 2348 rows, 28 columns
GPU Data: 618 rows, 13 columns
Full Relation Data: 2242 rows, 70 columns

Data Analysis¶

CPU Dataframe¶

Preview the data¶

Dataframe head¶

InĀ [4]:
# Peek at the leading rows of the CPU table
cpu_head = cpu_data.head()
print(cpu_head)
                    name performance_clockspeed performance_turbospeed  \
0   intel core i3 1315ue                   1.20                   4.50   
1     intel core i3 n300                   None                   3.80   
2    intel core i3 1305u                   1.60                   4.50   
3      amd ryzen 3 7320u                   2.40                   4.10   
4  intel core i5 1038ng7                   2.00                   3.80   

   performance_cores  performance_threads efficient_clockspeed  \
0                2.0                  4.0                 None   
1                8.0                  8.0                 None   
2                1.0                  2.0                 1.20   
3                4.0                  8.0                 None   
4                4.0                  8.0                 None   

  efficient_turbospeed  efficient_cores  efficient_threads    tdp  ...  \
0                 3.30              4.0                4.0  15.00  ...   
1                 None              NaN                NaN   7.00  ...   
2                 3.30              4.0                4.0  15.00  ...   
3                 None              NaN                NaN  15.00  ...   
4                 None              NaN                NaN  28.00  ...   

   eff_l2_cache  integer_math floating_point_math find_prime_numbers  \
0          None       34537.0             20958.0               51.0   
1          None       29169.0             19343.0               22.0   
2   1 x 2048 kb       27950.0             20052.0               36.0   
3          None       29638.0             14121.0               20.0   
4          None       27545.0             15238.0               28.0   

  random_string_sorting data_encryption data_compression physics  \
0               10759.0          6321.0         103162.0   824.0   
1               12797.0          7034.0         100731.0   516.0   
2               10623.0          6021.0          95060.0   518.0   
3               13922.0          6266.0         131689.0   437.0   
4               11471.0          5714.0         109286.0   698.0   

  extended_instructions  single_thread  
0                5172.0           3269  
1                5174.0           2122  
2                5262.0           3276  
3                5905.0           2378  
4                6539.0           2152  

[5 rows x 28 columns]

Dataframe tail¶

InĀ [5]:
# Display the last few rows
print(cpu_data.tail())
                                name performance_clockspeed  \
2343                     intel u300e                   1.10   
2344  arm huawei,kunpeng 920 24 core                   2.60   
2345             amd custom apu 0932                   2.40   
2346            intel core i7 10710u                   1.10   
2347            intel core i3 1125g4                   2.00   

     performance_turbospeed  performance_cores  performance_threads  \
2343                   4.30                1.0                  2.0   
2344                   None               24.0                 24.0   
2345                   3.50                4.0                  8.0   
2346                   4.70                6.0                 12.0   
2347                   3.70                4.0                  8.0   

     efficient_clockspeed efficient_turbospeed  efficient_cores  \
2343                 None                 3.20              4.0   
2344                 None                 None              NaN   
2345                 None                 None              NaN   
2346                 None                 None              NaN   
2347                 None                 None              NaN   

      efficient_threads    tdp  ...  eff_l2_cache  integer_math  \
2343                4.0  15.00  ...   1 x 2048 kb       30218.0   
2344                NaN   None  ...          None       91062.0   
2345                NaN  15.00  ...          None       28027.0   
2346                NaN  15.00  ...          None       35167.0   
2347                NaN  28.00  ...          None       29716.0   

     floating_point_math find_prime_numbers random_string_sorting  \
2343             21589.0               45.0               11513.0   
2344             30906.0               48.0               40681.0   
2345             17049.0               23.0               14366.0   
2346             21715.0               31.0               16853.0   
2347             18257.0               34.0               12839.0   

     data_encryption data_compression physics extended_instructions  \
2343          6421.0          98379.0   599.0                5279.0   
2344          2447.0          94224.0   822.0               10829.0   
2345          7582.0         117043.0   613.0                6566.0   
2346          3269.0         128017.0   642.0                8051.0   
2347          5666.0         107758.0   577.0                7990.0   

      single_thread  
2343           3546  
2344            733  
2345           2263  
2346           2336  
2347           2476  

[5 rows x 28 columns]

Check all the features¶

InĀ [6]:
# List every column label present in the CPU table
cpu_column_labels = cpu_data.columns
print(cpu_column_labels)
Index(['name', 'performance_clockspeed', 'performance_turbospeed',
       'performance_cores', 'performance_threads', 'efficient_clockspeed',
       'efficient_turbospeed', 'efficient_cores', 'efficient_threads', 'tdp',
       'multithread_rating', 'single_thread_rating', 'l1_instruction_cache',
       'l1_data_cache', 'l2_cache', 'l3_cache', 'eff_l1_instruction_cache',
       'eff_l1_data_cache', 'eff_l2_cache', 'integer_math',
       'floating_point_math', 'find_prime_numbers', 'random_string_sorting',
       'data_encryption', 'data_compression', 'physics',
       'extended_instructions', 'single_thread'],
      dtype='object')

Check the data types and non-null counts¶

InĀ [7]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the cell output.
cpu_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2348 entries, 0 to 2347
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2348 non-null   object 
 1   performance_clockspeed    2338 non-null   object 
 2   performance_turbospeed    926 non-null    object 
 3   performance_cores         2259 non-null   float64
 4   performance_threads       2259 non-null   float64
 5   efficient_clockspeed      129 non-null    object 
 6   efficient_turbospeed      115 non-null    object 
 7   efficient_cores           163 non-null    float64
 8   efficient_threads         163 non-null    float64
 9   tdp                       1441 non-null   object 
 10  multithread_rating        2348 non-null   int64  
 11  single_thread_rating      2348 non-null   int64  
 12  l1_instruction_cache      1409 non-null   object 
 13  l1_data_cache             1407 non-null   object 
 14  l2_cache                  1405 non-null   object 
 15  l3_cache                  868 non-null    object 
 16  eff_l1_instruction_cache  103 non-null    object 
 17  eff_l1_data_cache         103 non-null    object 
 18  eff_l2_cache              93 non-null     object 
 19  integer_math              2149 non-null   float64
 20  floating_point_math       2149 non-null   float64
 21  find_prime_numbers        2012 non-null   float64
 22  random_string_sorting     2149 non-null   float64
 23  data_encryption           1155 non-null   float64
 24  data_compression          2149 non-null   float64
 25  physics                   2149 non-null   float64
 26  extended_instructions     2149 non-null   float64
 27  single_thread             2348 non-null   int64  
dtypes: float64(12), int64(3), object(13)
memory usage: 513.8+ KB
None

Look at descriptive statistics¶

InĀ [8]:
# Summary statistics (count, mean, std, quartiles, extremes) of the CPU table
cpu_summary = cpu_data.describe()
print(cpu_summary)
       performance_cores  performance_threads  efficient_cores  \
count        2259.000000          2259.000000       163.000000   
mean            4.544046             5.947764         6.791411   
std             2.801395             4.002537         2.879159   
min             1.000000             1.000000         2.000000   
25%             2.000000             4.000000         4.000000   
50%             4.000000             4.000000         8.000000   
75%             8.000000             8.000000         8.000000   
max            32.000000            32.000000        16.000000   

       efficient_threads  multithread_rating  single_thread_rating  \
count         163.000000         2348.000000           2348.000000   
mean            6.957055         5056.670358           1393.641823   
std             3.081906         7341.559596           1016.341297   
min             2.000000           93.000000             95.000000   
25%             4.000000          840.500000            568.000000   
50%             8.000000         2168.500000           1086.500000   
75%             8.000000         5709.250000           1951.250000   
max            16.000000        57389.000000           4786.000000   

        integer_math  floating_point_math  find_prime_numbers  \
count    2149.000000          2149.000000         2012.000000   
mean    21716.518381         11977.772918           24.540258   
std     25417.697425         18214.180379           48.549368   
min       122.000000           166.000000            1.000000   
25%      5139.000000          1985.000000            5.000000   
50%     13523.000000          4760.000000           10.000000   
75%     25358.000000         12910.000000           23.250000   
max    209791.000000        131787.000000          619.000000   

       random_string_sorting  data_encryption  data_compression      physics  \
count            2149.000000      1155.000000       2149.000000  2149.000000   
mean             9679.062355      6004.123810      73248.829688   367.891112   
std             10193.531835      6337.221465      90040.342566   519.354517   
min               294.000000      1025.000000       2023.000000    14.000000   
25%              2917.000000      1869.000000      18278.000000    93.000000   
50%              5869.000000      3258.000000      38372.000000   184.000000   
75%             12586.000000      7656.500000      90448.000000   415.000000   
max             81685.000000     43769.000000     719086.000000  6476.000000   

       extended_instructions  single_thread  
count            2149.000000    2348.000000  
mean             3776.795254    1393.641823  
std              6023.329546    1016.341297  
min                25.000000      95.000000  
25%               537.000000     568.000000  
50%              1354.000000    1086.500000  
75%              3514.000000    1951.250000  
max             52490.000000    4786.000000  

Feature Analysis¶

Overall Performance Ratings¶

Features:

  • multithread_rating, single_thread_rating
Distribution of ratings¶
InĀ [9]:
# Use seaborn's white-grid theme for the figure
sns.set_theme(style="whitegrid")

# One panel per rating: (column, colour, title, x-axis label)
rating_panels = [
    ('single_thread_rating', 'blue', "Single Thread Rating Distribution", 'Single Thread Rating'),
    ('multithread_rating', 'green', "Multithread Rating Distribution", 'Multithread Rating'),
]

# Side-by-side axes, one for each rating
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (column, colour, title, x_label) in zip(axes, rating_panels):
    # Histogram with a KDE curve overlaid
    sns.histplot(cpu_data[column], ax=ax, color=colour, kde=True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Tidy the spacing and render
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [10]:
# Compute the summary statistics for both overall ratings up front
single_thread_stats = cpu_data['single_thread_rating'].describe()
multithread_stats = cpu_data['multithread_rating'].describe()

# Print each heading followed by its statistics on the next line
print("Single Thread Rating Statistics:", single_thread_stats, sep="\n")
print("\nMultithread Rating Statistics:", multithread_stats, sep="\n")
Single Thread Rating Statistics:
count    2348.000000
mean     1393.641823
std      1016.341297
min        95.000000
25%       568.000000
50%      1086.500000
75%      1951.250000
max      4786.000000
Name: single_thread_rating, dtype: float64

Multithread Rating Statistics:
count     2348.000000
mean      5056.670358
std       7341.559596
min         93.000000
25%        840.500000
50%       2168.500000
75%       5709.250000
max      57389.000000
Name: multithread_rating, dtype: float64
Single vs Multithreaded¶
InĀ [11]:
# Use seaborn's white-grid theme
sns.set_theme(style="whitegrid")

# Scatter the two ratings against each other on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=cpu_data,
    x='single_thread_rating',
    y='multithread_rating',
    alpha=0.7,
    ax=ax,
)

# Title, axis labels and grid
ax.set_title("Single Thread Rating vs Multithread Rating", fontsize=16)
ax.set_xlabel("Single Thread Rating", fontsize=14)
ax.set_ylabel("Multithread Rating", fontsize=14)
ax.grid(True)

plt.show()

# Correlation coefficient between the two ratings (Series.corr default method)
correlation = cpu_data['single_thread_rating'].corr(cpu_data['multithread_rating'])
print(f"The correlation between single_thread_rating and multithread_rating is: {correlation:.2f}")
No description has been provided for this image
The correlation between single_thread_rating and multithread_rating is: 0.88

Clockspeed metrics¶

Features:

  • performance_clockspeed, performance_turbospeed
  • efficient_clockspeed, efficient_turbospeed
Distribution¶
InĀ [12]:
# Set the plot style
sns.set_theme(style="whitegrid")

# The four clock columns have dtype object (see the cpu_data.info() output),
# so coerce them to floats once before plotting or comparing them; values that
# cannot be parsed become NaN. This matches the explicit pd.to_numeric
# handling used by the later clockspeed cells.
clock_columns = [
    'performance_clockspeed',
    'performance_turbospeed',
    'efficient_clockspeed',
    'efficient_turbospeed',
]
clock_speeds = cpu_data[clock_columns].apply(pd.to_numeric, errors='coerce')

# One KDE panel per column: (column, colour, title, x-axis label)
clock_panels = [
    ('performance_clockspeed', 'blue', "Performance Cores' Clockspeed Distribution", 'Clockspeed (GHz)'),
    ('performance_turbospeed', 'green', "Performance Cores' Turbospeed Distribution", 'Turbospeed (GHz)'),
    ('efficient_clockspeed', 'red', "Efficient Cores' Clockspeed Distribution", 'Clockspeed (GHz)'),
    ('efficient_turbospeed', 'purple', "Efficient Cores' Turbospeed Distribution", 'Turbospeed (GHz)'),
]

# Create subplots (axes.flat walks the 2x2 grid row by row)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, colour, title, x_label) in zip(axes.flat, clock_panels):
    sns.kdeplot(clock_speeds[column].dropna(), ax=ax, color=colour, fill=True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Density')

# Determine common x and y limits for all plots so the panels are comparable
x_min = clock_speeds.min().min()
x_max = clock_speeds.max().max()
y_max = max(ax.get_ylim()[1] for ax in axes.flat)  # Tallest density among all panels

# Set common limits
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

# Adjust layout
plt.tight_layout()

# Display the plot
plt.show()
No description has been provided for this image
Correlation with Performance¶
InĀ [13]:
# Calculate correlations
correlation_performance_single = cpu_data['performance_clockspeed'].corr(cpu_data['single_thread_rating'])
correlation_performance_multi = cpu_data['performance_clockspeed'].corr(cpu_data['multithread_rating'])
correlation_efficient_single = cpu_data['efficient_clockspeed'].corr(cpu_data['single_thread_rating'])
correlation_efficient_multi = cpu_data['efficient_clockspeed'].corr(cpu_data['multithread_rating'])

# Print the results
print(f"Correlation between performance_clockspeed and single_thread_rating: {correlation_performance_single:.2f}")
print(f"Correlation between performance_clockspeed and multithread_rating: {correlation_performance_multi:.2f}")
print(f"Correlation between efficient_clockspeed and single_thread_rating: {correlation_efficient_single:.2f}")
print(f"Correlation between efficient_clockspeed and multithread_rating: {correlation_efficient_multi:.2f}")
Correlation between performance_clockspeed and single_thread_rating: 0.61
Correlation between performance_clockspeed and multithread_rating: 0.48
Correlation between efficient_clockspeed and single_thread_rating: 0.21
Correlation between efficient_clockspeed and multithread_rating: 0.14
InĀ [14]:
# Work on a copy so the coercions below leave cpu_data untouched
cpu_data_clone = cpu_data.copy()

# Coerce every plotted column to a numeric dtype (unparseable values -> NaN)
for numeric_column in (
    'performance_clockspeed',
    'efficient_clockspeed',
    'single_thread_rating',
    'multithread_rating',
):
    cpu_data_clone[numeric_column] = pd.to_numeric(cpu_data_clone[numeric_column], errors='coerce')

# One regression panel per pairing:
# (x column, y column, colour, title, x-axis label, y-axis label)
clock_regressions = [
    ('performance_clockspeed', 'single_thread_rating', 'blue',
     'Performance Clockspeed vs Single Thread Rating',
     'Performance Clockspeed (GHz)', 'Single Thread Rating'),
    ('performance_clockspeed', 'multithread_rating', 'green',
     'Performance Clockspeed vs Multithread Rating',
     'Performance Clockspeed (GHz)', 'Multithread Rating'),
    ('efficient_clockspeed', 'single_thread_rating', 'red',
     'Efficient Clockspeed vs Single Thread Rating',
     'Efficient Clockspeed (GHz)', 'Single Thread Rating'),
    ('efficient_clockspeed', 'multithread_rating', 'purple',
     'Efficient Clockspeed vs Multithread Rating',
     'Efficient Clockspeed (GHz)', 'Multithread Rating'),
]

# 2x2 grid; axes.flat walks it row by row, matching the list order above
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (x_col, y_col, colour, title, x_label, y_label) in zip(axes.flat, clock_regressions):
    # Scatter with a fitted regression line; small markers keep dense areas legible
    sns.regplot(data=cpu_data_clone, x=x_col, y=y_col, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
No description has been provided for this image
Boost impact¶
InĀ [15]:
# Work on a copy so cpu_data itself is left untouched
cpu_data_clone = cpu_data.copy()

# Coerce the four clock columns to numeric; unparseable values become NaN
for clock_column in (
    'performance_turbospeed',
    'performance_clockspeed',
    'efficient_turbospeed',
    'efficient_clockspeed',
):
    cpu_data_clone[clock_column] = pd.to_numeric(cpu_data_clone[clock_column], errors='coerce')

# Turbo boost margin = turbo speed minus base clock, per core type
cpu_data_clone['performance_turbo_boost'] = cpu_data_clone['performance_turbospeed'].sub(
    cpu_data_clone['performance_clockspeed']
)
cpu_data_clone['efficient_turbo_boost'] = cpu_data_clone['efficient_turbospeed'].sub(
    cpu_data_clone['efficient_clockspeed']
)

# One regression panel per pairing:
# (x column, y column, colour, title, x-axis label, y-axis label)
boost_regressions = [
    ('performance_turbo_boost', 'single_thread_rating', 'blue',
     'Performance Turbo Boost vs Single Thread Rating',
     'Performance Turbo Boost (GHz)', 'Single Thread Rating'),
    ('performance_turbo_boost', 'multithread_rating', 'green',
     'Performance Turbo Boost vs Multithread Rating',
     'Performance Turbo Boost (GHz)', 'Multithread Rating'),
    ('efficient_turbo_boost', 'single_thread_rating', 'red',
     'Efficient Turbo Boost vs Single Thread Rating',
     'Efficient Turbo Boost (GHz)', 'Single Thread Rating'),
    ('efficient_turbo_boost', 'multithread_rating', 'purple',
     'Efficient Turbo Boost vs Multithread Rating',
     'Efficient Turbo Boost (GHz)', 'Multithread Rating'),
]

# 2x2 grid; axes.flat walks it row by row, matching the list order above
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (x_col, y_col, colour, title, x_label, y_label) in zip(axes.flat, boost_regressions):
    sns.regplot(data=cpu_data_clone, x=x_col, y=y_col, ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
No description has been provided for this image

Core & Thread Analysis¶

Features:

  • performance_cores, performance_threads
  • efficient_cores, efficient_threads
Distribution¶
InĀ [16]:
# Use seaborn's white-grid theme
sns.set_theme(style="whitegrid")

# One histogram panel per count column: (column, colour, title, x-axis label)
count_panels = [
    ('performance_cores', 'blue', "Performance Cores Distribution", 'Number of Cores'),
    ('performance_threads', 'green', "Performance Threads Distribution", 'Number of Threads'),
    ('efficient_cores', 'red', "Efficient Cores Distribution", 'Number of Cores'),
    ('efficient_threads', 'purple', "Efficient Threads Distribution", 'Number of Threads'),
]
count_columns = [panel[0] for panel in count_panels]

# 2x2 grid; axes.flat walks it row by row, matching the list order above
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, colour, title, x_label) in zip(axes.flat, count_panels):
    # Histogram of the non-missing values with a KDE curve overlaid
    sns.histplot(cpu_data[column].dropna(), ax=ax, color=colour, kde=True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Shared axis limits: smallest/largest value over the four columns, and the
# tallest y-limit among the panels
x_min = cpu_data[count_columns].min().min()
x_max = cpu_data[count_columns].max().max()
y_max = max(panel_ax.get_ylim()[1] for panel_ax in axes.flat)

# Apply the shared limits so the panels are directly comparable
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [17]:
# Clone the cpu_data DataFrame
cpu_data_clone = cpu_data.copy()

# Calculate core/thread ratio for performance and efficient cores
cpu_data_clone['performance_core_thread_ratio'] = cpu_data_clone['performance_cores'] / cpu_data_clone['performance_threads']
cpu_data_clone['efficient_core_thread_ratio'] = cpu_data_clone['efficient_cores'] / cpu_data_clone['efficient_threads']

# Calculate frequency counts for each ratio
performance_ratio_counts = cpu_data_clone['performance_core_thread_ratio'].value_counts().sort_index()
efficient_ratio_counts = cpu_data_clone['efficient_core_thread_ratio'].value_counts().sort_index()

# Print the frequency of core/thread ratios
print("Performance Core/Thread Ratio Frequencies:")
print(performance_ratio_counts)

print("\nEfficient Core/Thread Ratio Frequencies:")
print(efficient_ratio_counts)
Performance Core/Thread Ratio Frequencies:
performance_core_thread_ratio
0.5     774
1.0    1485
Name: count, dtype: int64

Efficient Core/Thread Ratio Frequencies:
efficient_core_thread_ratio
0.5      4
1.0    159
Name: count, dtype: int64
Multi-threading impact¶
InĀ [18]:
# Every correlation in this cell is taken against the multithread rating
multithread_target = cpu_data['multithread_rating']

correlation_performance_cores = cpu_data['performance_cores'].corr(multithread_target)
correlation_performance_threads = cpu_data['performance_threads'].corr(multithread_target)
correlation_efficient_cores = cpu_data['efficient_cores'].corr(multithread_target)
correlation_efficient_threads = cpu_data['efficient_threads'].corr(multithread_target)

# Report each coefficient rounded to two decimals
for feature_name, coefficient in (
    ('performance_cores', correlation_performance_cores),
    ('performance_threads', correlation_performance_threads),
    ('efficient_cores', correlation_efficient_cores),
    ('efficient_threads', correlation_efficient_threads),
):
    print(f"Correlation between {feature_name} and multithread_rating: {coefficient:.2f}")
Correlation between performance_cores and multithread_rating: 0.41
Correlation between performance_threads and multithread_rating: 0.74
Correlation between efficient_cores and multithread_rating: 0.47
Correlation between efficient_threads and multithread_rating: 0.49
InĀ [19]:
# One regression panel per count column against the multithread rating:
# (x column, colour, title, x-axis label)
count_regressions = [
    ('performance_cores', 'blue', 'Performance Cores vs Multithread Rating', 'Performance Cores'),
    ('performance_threads', 'green', 'Performance Threads vs Multithread Rating', 'Performance Threads'),
    ('efficient_cores', 'red', 'Efficient Cores vs Multithread Rating', 'Efficient Cores'),
    ('efficient_threads', 'purple', 'Efficient Threads vs Multithread Rating', 'Efficient Threads'),
]

# 2x2 grid; axes.flat walks it row by row, matching the list order above
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (x_col, colour, title, x_label) in zip(axes.flat, count_regressions):
    # Scatter with a fitted regression line; small markers reduce overplotting
    sns.regplot(data=cpu_data, x=x_col, y='multithread_rating', ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Multithread Rating')

plt.tight_layout()
plt.show()
No description has been provided for this image

Power Consumption (TDP)¶

Features:

  • tdp
TDP vs Performance¶
InĀ [20]:
# Build a copy of cpu_data whose 'tdp' column is numeric (unparseable -> NaN);
# assign() returns a new frame, so cpu_data itself is not modified
cpu_data_clone = cpu_data.assign(tdp=pd.to_numeric(cpu_data['tdp'], errors='coerce'))

# Correlate the numeric TDP with each overall rating
tdp_numeric = cpu_data_clone['tdp']
correlation_tdp_single = tdp_numeric.corr(cpu_data_clone['single_thread_rating'])
correlation_tdp_multi = tdp_numeric.corr(cpu_data_clone['multithread_rating'])

# Report both coefficients rounded to two decimals
print(f"Correlation between TDP and single_thread_rating: {correlation_tdp_single:.2f}")
print(f"Correlation between TDP and multithread_rating: {correlation_tdp_multi:.2f}")
Correlation between TDP and single_thread_rating: 0.39
Correlation between TDP and multithread_rating: 0.43
InĀ [21]:
# NOTE: reads cpu_data_clone, so a cell that defines it (with a numeric 'tdp'
# column) must have been run before this one.

# One regression panel per rating: (y column, colour, title, y-axis label)
tdp_regressions = [
    ('single_thread_rating', 'blue', 'TDP vs Single Thread Rating', 'Single Thread Rating'),
    ('multithread_rating', 'green', 'TDP vs Multithread Rating', 'Multithread Rating'),
]

# Two side-by-side axes
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (y_col, colour, title, y_label) in zip(axes, tdp_regressions):
    # Scatter with a fitted regression line; small markers reduce overplotting
    sns.regplot(data=cpu_data_clone, x='tdp', y=y_col, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(title)
    ax.set_xlabel('TDP (W)')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
No description has been provided for this image
Efficiency Analysis¶
InĀ [22]:
# Clone the cpu_data DataFrame so cpu_data itself is left untouched
cpu_data_clone = cpu_data.copy()

# Ensure 'tdp' column is numeric (unparseable values become NaN)
cpu_data_clone['tdp'] = pd.to_numeric(cpu_data_clone['tdp'], errors='coerce')

# Keep only rows with a strictly positive TDP before dividing. A TDP of 0
# would divide to inf, which survives both dropna() and the "> 0" filter
# below, would top the ranking and distort the histogram. Rows with a missing
# TDP are dropped here too (NaN > 0 is False). The .copy() lets the new column
# be assigned without a chained-assignment warning.
cpu_data_clone = cpu_data_clone[cpu_data_clone['tdp'] > 0].copy()

# Calculate performance efficiency: multithread rating per unit of TDP
cpu_data_clone['performance_efficiency'] = cpu_data_clone['multithread_rating'] / cpu_data_clone['tdp']

# Drop rows with NaN values in 'performance_efficiency'
cpu_data_clone = cpu_data_clone.dropna(subset=['performance_efficiency'])

# Filter out rows where 'performance_efficiency' is less than or equal to 0
cpu_data_clone = cpu_data_clone[cpu_data_clone['performance_efficiency'] > 0]

# Sort the DataFrame by 'performance_efficiency', most efficient first
cpu_data_clone = cpu_data_clone.sort_values(by='performance_efficiency', ascending=False)

# Columns shown in the ranking previews
efficiency_columns = ['name', 'multithread_rating', 'tdp', 'performance_efficiency']

# Display the top 5 rows of the updated DataFrame
print("Top 5 rows:")
print(cpu_data_clone[efficiency_columns].head())

# Display the bottom 5 rows of the updated DataFrame
print("\nBottom 5 rows:")
print(cpu_data_clone[efficiency_columns].tail())

# Plot the distribution of performance efficiency
plt.figure(figsize=(10, 6))
sns.histplot(cpu_data_clone['performance_efficiency'], kde=True, color="blue", bins=30)
plt.title("Performance Efficiency Distribution", fontsize=16)
plt.xlabel("Performance Efficiency (multithread_rating / tdp)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(True)
plt.show()
Top 5 rows:
                        name  multithread_rating   tdp  performance_efficiency
117  intel core ultra 7 164u               15187   9.0             1687.444444
470     amd ryzen z1 extreme               25182  15.0             1678.800000
82             apple a18 pro               13063   8.0             1632.875000
91       intel core i7 1260u               14001   9.0             1555.666667
52       intel core i7 1250u               11673   9.0             1297.000000

Bottom 5 rows:
                             name  multithread_rating   tdp  \
735  mobile amd athlon xp-m 1800+                 193  45.0   
980    mobile amd athlon 64 3400+                 333  81.5   
974    mobile amd athlon 64 3200+                 326  81.5   
671            intel celeron b710                 106  35.0   
681  mobile intel celeron 1.80ghz                 121  66.1   

     performance_efficiency  
735                4.288889  
980                4.085890  
974                4.000000  
671                3.028571  
681                1.830560  
No description has been provided for this image

GPU Dataframe¶

Preview the data¶

Dataframe head¶

InĀ [23]:
# Preview the first rows of the GPU table (head() returns 5 rows by default) as plain text
print(gpu_data.head())
                      name  avg_g3d_mark bus_interface  max_memory_size  \
0          radeon rx 6600m         13814   pcie 4.0 x8           8192.0   
1      radeont rx 6850m xt         13848  pcie 4.0 x16          12288.0   
2  rtx 1000 ada generation         14043          None              NaN   
3           rtx a3000 12gb         14088          None              NaN   
4         geforce rtx 4050         14433  pcie 4.0 x16           6144.0   

   core_clock max_direct open_gl  max_tdp  test_directx_9  test_directx_10  \
0      2068.0       12_2     4.6    100.0           180.0             89.0   
1      2321.0       12_2     4.6    165.0           144.0            106.0   
2         NaN       None    None      NaN           179.0             74.0   
3         NaN       None    None      NaN           169.0             88.0   
4      1605.0       12_2     4.6    115.0           186.0             81.0   

   test_directx_11  test_directx_12  test_gpu_compute  
0            135.0             52.0            5752.0  
1            166.0             59.0            5210.0  
2            115.0             65.0            5471.0  
3            115.0             65.0            5593.0  
4            131.0             61.0            5943.0  

Dataframe tail¶

InĀ [24]:
# Preview the last rows of the GPU table (tail() returns 5 rows by default) as plain text
print(gpu_data.tail())
                        name  avg_g3d_mark bus_interface  max_memory_size  \
613          radeon rx 7900m         22752          None              NaN   
614  rtx 4000 ada generation         22962          None              NaN   
615  rtx 5000 ada generation         24006          None              NaN   
616         geforce rtx 4080         25076  pcie 4.0 x16          12288.0   
617         geforce rtx 4090         27754  pcie 4.0 x16          16384.0   

     core_clock max_direct open_gl  max_tdp  test_directx_9  test_directx_10  \
613         NaN       None    None      NaN           267.0            127.0   
614         NaN       None    None      NaN           271.0            140.0   
615         NaN       None    None      NaN           272.0            153.0   
616      1860.0       12_2     4.6    150.0           286.0            161.0   
617      1455.0       12_2     4.6    150.0           315.0            181.0   

     test_directx_11  test_directx_12  test_gpu_compute  
613            256.0             93.0            9297.0  
614            224.0            100.0            9232.0  
615            239.0            102.0            9553.0  
616            248.0             96.0           11422.0  
617            270.0            107.0           12650.0  

Check all the features¶

InĀ [25]:
# List every column (feature) name available in the GPU table
print(gpu_data.columns)
Index(['name', 'avg_g3d_mark', 'bus_interface', 'max_memory_size',
       'core_clock', 'max_direct', 'open_gl', 'max_tdp', 'test_directx_9',
       'test_directx_10', 'test_directx_11', 'test_directx_12',
       'test_gpu_compute'],
      dtype='object')

Check the data types and non-null counts¶

InĀ [26]:
# Column dtypes and non-null counts for the GPU table.
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() only appends a stray "None" line.
gpu_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              618 non-null    object 
 1   avg_g3d_mark      618 non-null    int64  
 2   bus_interface     349 non-null    object 
 3   max_memory_size   342 non-null    float64
 4   core_clock        309 non-null    float64
 5   max_direct        353 non-null    object 
 6   open_gl           346 non-null    object 
 7   max_tdp           245 non-null    float64
 8   test_directx_9    340 non-null    float64
 9   test_directx_10   340 non-null    float64
 10  test_directx_11   340 non-null    float64
 11  test_directx_12   340 non-null    float64
 12  test_gpu_compute  340 non-null    float64
dtypes: float64(8), int64(1), object(4)
memory usage: 62.9+ KB
None

Look at descriptive statistics¶

InĀ [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric GPU columns
print(gpu_data.describe())
       avg_g3d_mark  max_memory_size   core_clock     max_tdp  test_directx_9  \
count    618.000000       342.000000   309.000000  245.000000      340.000000   
mean    2784.377023      2852.590643   756.132686   58.142857       64.752941   
std     4605.472224      3298.820120   366.364012   38.361524       67.330367   
min        2.000000         2.000000   143.000000    7.000000        1.000000   
25%      358.000000       512.000000   500.000000   25.000000       11.000000   
50%      671.500000      2048.000000   660.000000   50.000000       36.000000   
75%     2697.000000      4096.000000   954.000000   80.000000      107.250000   
max    27754.000000     16384.000000  2321.000000  165.000000      315.000000   

       test_directx_10  test_directx_11  test_directx_12  test_gpu_compute  
count       340.000000       340.000000       340.000000        340.000000  
mean         26.597059        38.252941        19.311765       1892.626471  
std          37.149137        51.488414        24.423437       2288.641490  
min           0.000000         0.000000         0.000000          0.000000  
25%           2.000000         4.000000         0.000000        239.500000  
50%           7.000000        15.000000         7.500000        806.000000  
75%          35.000000        54.000000        31.000000       2865.000000  
max         181.000000       270.000000       107.000000      12650.000000  

Feature Analysis¶

Clock Speed Analysis¶

Features:

  • core_clock
Distribution¶
InĀ [28]:
# Histogram (with KDE overlay) of GPU core clock speeds; rows without a clock value are left out
core_clock_values = gpu_data['core_clock'].dropna()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(core_clock_values, kde=True, color='blue', bins=30, ax=ax)

# Title and axis labels
ax.set_title("Distribution of GPU Core Clock Speeds", fontsize=16)
ax.set_xlabel("Core Clock (MHz)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image
Impact on Performance¶
InĀ [29]:
# Correlation (Series.corr, Pearson by default) between core_clock and each benchmark score.
# The individual names are kept so they remain available to the rest of the notebook.
core_clock = gpu_data['core_clock']
correlation_core_clock_avg_g3d_mark = core_clock.corr(gpu_data['avg_g3d_mark'])
correlation_core_clock_test_directx_9 = core_clock.corr(gpu_data['test_directx_9'])
correlation_core_clock_test_directx_10 = core_clock.corr(gpu_data['test_directx_10'])
correlation_core_clock_test_directx_11 = core_clock.corr(gpu_data['test_directx_11'])
correlation_core_clock_test_directx_12 = core_clock.corr(gpu_data['test_directx_12'])
correlation_core_clock_test_gpu_compute = core_clock.corr(gpu_data['test_gpu_compute'])

# One entry per panel: (benchmark column, title label, plot colour, correlation)
benchmark_panels = [
    ('avg_g3d_mark', 'Avg G3D Mark', 'blue', correlation_core_clock_avg_g3d_mark),
    ('test_directx_9', 'Test DirectX 9', 'green', correlation_core_clock_test_directx_9),
    ('test_directx_10', 'Test DirectX 10', 'red', correlation_core_clock_test_directx_10),
    ('test_directx_11', 'Test DirectX 11', 'purple', correlation_core_clock_test_directx_11),
    ('test_directx_12', 'Test DirectX 12', 'orange', correlation_core_clock_test_directx_12),
    ('test_gpu_compute', 'Test GPU Compute', 'brown', correlation_core_clock_test_gpu_compute),
]

# Report every coefficient before drawing anything
for benchmark, panel_label, panel_colour, coefficient in benchmark_panels:
    print(f"Correlation between core_clock and {benchmark}: {coefficient:.2f}")

# Plot style
sns.set_theme(style="whitegrid")

# 3x2 grid, filled row by row in the same order as benchmark_panels
fig, axes = plt.subplots(3, 2, figsize=(14, 18))

for panel_ax, (benchmark, panel_label, panel_colour, coefficient) in zip(axes.flat, benchmark_panels):
    # Scatter plus fitted regression line of the benchmark against core_clock
    sns.regplot(data=gpu_data, x='core_clock', y=benchmark, ax=panel_ax, color=panel_colour, scatter_kws={'s': 10})
    panel_ax.set_title(f"Core Clock vs {panel_label} (Correlation: {coefficient:.2f})")

# Avoid overlapping titles/labels, then render
plt.tight_layout()
plt.show()
Correlation between core_clock and avg_g3d_mark: 0.71
Correlation between core_clock and test_directx_9: 0.70
Correlation between core_clock and test_directx_10: 0.63
Correlation between core_clock and test_directx_11: 0.68
Correlation between core_clock and test_directx_12: 0.70
Correlation between core_clock and test_gpu_compute: 0.68
No description has been provided for this image

Memory and Bandwidth Analysis¶

Features:

  • max_memory_size
  • bus_interface
Memory Size¶
InĀ [30]:
# Work on a copy so the category column added later in this cell does not modify gpu_data itself
gpu_data_clone = gpu_data.copy()

# Bucket a GPU memory size into a labelled range, with a dedicated label for missing values
def categorize_memory_size(memory_size):
    """Return the memory-size bucket label for a single value.

    Missing values (anything ``pd.isna`` reports as missing) map to 'Unknown'.
    Every upper bound is inclusive, so 2048 still falls in '<2GB', 4096 in
    '2–4GB', and so on; anything above 16384 is '>16GB'.
    Thresholds presumably are in MB (2048 is labelled as 2GB) - confirm against the data source.
    """
    if pd.isna(memory_size):
        return 'Unknown'

    # (inclusive upper bound, label), checked from smallest to largest
    bucket_limits = (
        (2048, '<2GB'),
        (4096, '2–4GB'),
        (8192, '4–8GB'),
        (16384, '8–16GB'),
    )
    for upper_bound, bucket_label in bucket_limits:
        if memory_size <= upper_bound:
            return bucket_label

    return '>16GB'

# Tag every GPU with its memory-size bucket
gpu_data_clone['memory_size_category'] = gpu_data_clone['max_memory_size'].apply(categorize_memory_size)

# Mean avg_g3d_mark per bucket, leaving out GPUs whose memory size is unknown
mean_g3d_by_bucket = gpu_data_clone.groupby('memory_size_category')['avg_g3d_mark'].mean()
memory_size_comparison = mean_g3d_by_bucket.drop(labels='Unknown', errors='ignore')

# Show which buckets actually occur (groupby returns them in alphabetical order)
print("Unique categories in memory_size_comparison:", memory_size_comparison.index)

# Buckets from smallest to largest, restricted to the ones present in the data
size_ordered_labels = ['<2GB', '2–4GB', '4–8GB', '8–16GB', '>16GB']
category_order = [bucket for bucket in size_ordered_labels if bucket in memory_size_comparison.index]

# Reorder the averages by size instead of alphabetically
memory_size_comparison = memory_size_comparison.loc[category_order]

# Print the ordered averages
print(memory_size_comparison)

# Bar chart of the ordered averages
fig, ax = plt.subplots(figsize=(10, 6))
memory_size_comparison.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title("Average G3D Mark by GPU Memory Size Category", fontsize=16)
ax.set_xlabel("Memory Size Category", fontsize=14)
ax.set_ylabel("Average G3D Mark", fontsize=14)
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Render the figure
plt.show()
Unique categories in memory_size_comparison: Index(['2–4GB', '4–8GB', '8–16GB', '<2GB'], dtype='object', name='memory_size_category')
memory_size_category
<2GB        579.118182
2–4GB      3846.544118
4–8GB     11477.357143
8–16GB    16148.416667
Name: avg_g3d_mark, dtype: float64
No description has been provided for this image
Bus Interface¶
InĀ [31]:
# Work on a copy of the GPU table
gpu_data_clone = gpu_data.copy()

# Keep only rows where both the bus interface and the G3D score are known
filtered_gpu_data_clone = gpu_data_clone.dropna(subset=['bus_interface', 'avg_g3d_mark'])

# Mean avg_g3d_mark per bus interface, sorted from lowest to highest
bus_interface_performance = filtered_gpu_data_clone.groupby('bus_interface')['avg_g3d_mark'].mean().sort_values()

# Horizontal bar chart, one bar per bus interface.
# seaborn >= 0.13 deprecates `palette` without `hue` (removal announced for v0.14.0),
# so the category is also passed as `hue` and the redundant legend is switched off.
plt.figure(figsize=(12, 6))
sns.barplot(
    y=bus_interface_performance.index,
    x=bus_interface_performance.values,
    hue=bus_interface_performance.index,
    palette="viridis",
    orient='h',
    legend=False,
)
plt.title("Impact of Bus Interface on GPU Performance (avg_g3d_mark)", fontsize=16)
plt.xlabel("Average G3D Mark", fontsize=14)
plt.ylabel("Bus Interface", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Show the plot
plt.show()
/tmp/ipykernel_110005/3052634162.py:12: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=bus_interface_performance.index, x=bus_interface_performance.values, palette="viridis", orient='h')
No description has been provided for this image

Power Consumption (TDP)¶

Features:

  • max_tdp
Performance vs Power¶
InĀ [32]:
# Copy of the GPU table with 'max_tdp' forced to numeric (unparseable values become NaN)
gpu_data_clone = gpu_data.assign(max_tdp=pd.to_numeric(gpu_data['max_tdp'], errors='coerce'))

# Correlation between power draw and the G3D score
tdp_values = gpu_data_clone['max_tdp']
correlation_tdp_g3d = tdp_values.corr(gpu_data_clone['avg_g3d_mark'])

# Report the coefficient
print(f"Correlation between max_tdp and avg_g3d_mark: {correlation_tdp_g3d:.2f}")

# Scatter plot with a fitted regression line
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=gpu_data_clone, x='max_tdp', y='avg_g3d_mark', color='blue', scatter_kws={'s': 10}, ax=ax)

# Title, axis labels and grid
ax.set_title("Max TDP vs Avg G3D Mark", fontsize=16)
ax.set_xlabel("Max TDP (W)", fontsize=14)
ax.set_ylabel("Avg G3D Mark", fontsize=14)
ax.grid(True)

# Render the figure
plt.show()
Correlation between max_tdp and avg_g3d_mark: 0.75
No description has been provided for this image
Efficiency¶
InĀ [33]:
# Work on a copy of the GPU table
gpu_data_clone = gpu_data.copy()

# Ensure 'avg_g3d_mark' and 'max_tdp' are numeric (unparseable values become NaN)
gpu_data_clone['avg_g3d_mark'] = pd.to_numeric(gpu_data_clone['avg_g3d_mark'], errors='coerce')
gpu_data_clone['max_tdp'] = pd.to_numeric(gpu_data_clone['max_tdp'], errors='coerce')

# Keep only rows where the ratio is well defined:
#   - both values present (a NaN score would otherwise sort to the end and show up as a "bottom" GPU)
#   - strictly positive TDP (a zero TDP would give an infinite efficiency and top the ranking),
#     mirroring the `> 0` filter used for the CPU efficiency analysis
gpu_data_clone = gpu_data_clone.dropna(subset=['avg_g3d_mark', 'max_tdp'])
gpu_data_clone = gpu_data_clone[gpu_data_clone['max_tdp'] > 0].copy()

# Performance efficiency: G3D score per unit of TDP
gpu_data_clone['efficiency'] = gpu_data_clone['avg_g3d_mark'] / gpu_data_clone['max_tdp']

# Rank from most to least efficient
gpu_data_sorted = gpu_data_clone.sort_values(by='efficiency', ascending=False)

# Display the top 5 rows of the sorted DataFrame
print("Top 5 GPUs by Efficiency:")
print(gpu_data_sorted[['name', 'avg_g3d_mark', 'max_tdp', 'efficiency']].head())

# Display the bottom 5 rows of the sorted DataFrame
print("\nBottom 5 GPUs by Efficiency:")
print(gpu_data_sorted[['name', 'avg_g3d_mark', 'max_tdp', 'efficiency']].tail())
Top 5 GPUs by Efficiency:
                 name  avg_g3d_mark  max_tdp  efficiency
510  radeon pro w6300          5560     25.0  222.400000
591   radeon rx 7600s         14732     75.0  196.426667
593   radeon rx 6700s         14974     80.0  187.175000
617  geforce rtx 4090         27754    150.0  185.026667
556  radeon pro 5600m          9233     50.0  184.660000

Bottom 5 GPUs by Efficiency:
                    name  avg_g3d_mark  max_tdp  efficiency
103       radeon hd 6320           147     45.0    3.266667
121  geforce go 7800 gtx           210     65.0    3.230769
84        radeon hd 6310           122     45.0    2.711111
63        radeon hd 6250            94     35.0    2.685714
70        radeon hd 6290           105     45.0    2.333333

Overall Performance Ratings¶

Features:

  • avg_g3d_mark (PassMark G3D Mark score)
  • test_gpu_compute (compute performance)
Distribution of ratings¶
InĀ [34]:
# One histogram (with KDE overlay) per rating: (column, display label, colour)
rating_histograms = (
    ('avg_g3d_mark', "Avg G3D Mark", 'blue'),
    ('test_gpu_compute', "Test GPU Compute", 'green'),
)

for rating_column, rating_label, hist_colour in rating_histograms:
    # Separate figure per rating; missing scores are excluded
    plt.figure(figsize=(12, 6))
    sns.histplot(gpu_data[rating_column].dropna(), kde=True, color=hist_colour, bins=30)
    plt.title(f"Distribution of {rating_label}", fontsize=16)
    plt.xlabel(rating_label, fontsize=14)
    plt.ylabel("Frequency", fontsize=14)
    plt.grid(True)
    plt.show()
No description has been provided for this image
No description has been provided for this image
Compute vs Gaming¶
InĀ [35]:
# Work on a copy of the GPU table
gpu_data_clone = gpu_data.copy()

# Performance categories based on avg_g3d_mark.
# The last bin is open-ended: scores in this dataset go far beyond 10000 (max 27754), and a
# closed upper edge would leave the fastest GPUs with a NaN category, i.e. missing from the
# colour-coded scatter plot even though they are part of the correlation below.
bins = [0, 2000, 4000, 6000, 8000, float('inf')]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
gpu_data_clone['performance_category'] = pd.cut(gpu_data_clone['avg_g3d_mark'], bins=bins, labels=labels)

# Correlation between gaming (G3D) and compute scores
correlation_gaming_compute = gpu_data_clone['avg_g3d_mark'].corr(gpu_data_clone['test_gpu_compute'])

# Scatter plot coloured by performance category
plt.figure(figsize=(10, 6))
sns.scatterplot(data=gpu_data_clone, x='avg_g3d_mark', y='test_gpu_compute', hue='performance_category', alpha=0.7)

# Add titles and labels
plt.title(f"Avg G3D Mark vs Test GPU Compute (Correlation: {correlation_gaming_compute:.2f})", fontsize=16)
plt.xlabel("Avg G3D Mark (Gaming Performance)", fontsize=14)
plt.ylabel("Test GPU Compute (Compute Performance)", fontsize=14)
plt.grid(True)

# Show plot
plt.show()

# Print correlation
print(f"The correlation between avg_g3d_mark and test_gpu_compute is: {correlation_gaming_compute:.2f}")
No description has been provided for this image
The correlation between avg_g3d_mark and test_gpu_compute is: 0.99

Full Laptop Dataframe¶

Source (Laptop Shop)¶

Analyzing number of laptops from each source¶

InĀ [36]:
# Number of laptops listed by each shop (value_counts sorts the most frequent first)
source_counts = full_relation['laptop_specs_source'].value_counts()

# Horizontal bar chart, one bar per shop.
# seaborn >= 0.13 deprecates `palette` without `hue` (removal announced for v0.14.0),
# so the category is also passed as `hue` and the redundant legend is switched off.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    y=source_counts.index,
    x=source_counts.values,
    hue=source_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Number of laptops per shop", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Source", fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate every bar with its count (iterating all containers works with or without hue)
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_110005/3058434414.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=source_counts.index, x=source_counts.values, palette="viridis")
No description has been provided for this image

Analyzing price grouped by source¶

InĀ [37]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Box plot of the price distribution for each shop.
# seaborn >= 0.13 deprecates `palette` without `hue` (removal announced for v0.14.0),
# so the grouping column is also passed as `hue` and the redundant legend is switched off.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=full_relation,
    y='laptop_specs_source',
    x='laptop_specs_price',
    hue='laptop_specs_source',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Source", fontsize=16)
plt.ylabel("Source", fontsize=14)
plt.xlabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_110005/3580318046.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, y='laptop_specs_source', x='laptop_specs_price', palette="viridis")
No description has been provided for this image
InĀ [38]:
# Prices grouped per shop
source_price_groups = full_relation.groupby('laptop_specs_source')['laptop_specs_price']

# Descriptive statistics (count, mean, std, min, quartiles, max) of the price for each shop
price_stats_by_source = source_price_groups.describe()

# Show the summary table
print(price_stats_by_source)
                     count          mean           std         min  \
laptop_specs_source                                                  
cellphones           264.0  2.940125e+07  2.012227e+07   9490000.0   
fptshop              211.0  2.747815e+07  1.818169e+07   9490000.0   
gearvn               178.0  2.941697e+07  1.311638e+07   8490000.0   
hacom                482.0  2.435385e+07  1.207721e+07   8799000.0   
laptopaz             198.0  2.660227e+07  1.243102e+07  11990000.0   
laptopworld          208.0  3.788760e+07  1.827105e+07  14990000.0   
nguyenkim             91.0  1.987462e+07  9.371410e+06   9490000.0   
phongvu              238.0  2.805796e+07  1.540491e+07   9790000.0   
phucanh              136.0  2.346235e+07  8.047521e+06  11900000.0   
thegioididong        236.0  2.241712e+07  1.075848e+07   7990000.0   

                            25%         50%         75%          max  
laptop_specs_source                                                   
cellphones           17140000.0  23840000.0  34990000.0  182490000.0  
fptshop              16490000.0  21990000.0  31440000.0  128990000.0  
gearvn               19990000.0  25990000.0  36740000.0   83990000.0  
hacom                16799000.0  21199000.0  29374000.0   95699000.0  
laptopaz             17990000.0  23990000.0  30490000.0   85000000.0  
laptopworld          25365000.0  32590000.0  45240000.0  116990000.0  
nguyenkim            15990000.0  18490000.0  21690000.0   80990000.0  
phongvu              18990000.0  23990000.0  30365000.0  106990000.0  
phucanh              18490000.0  21990000.0  27115000.0   78800000.0  
thegioididong        16490000.0  19490000.0  26515000.0  106990000.0  

Brand¶

Analysing number of laptops from each brand¶

InĀ [39]:
# Number of laptops per brand (value_counts sorts the most frequent first)
brand_counts = full_relation['laptop_specs_brand'].value_counts()

# Horizontal bar chart, one bar per brand.
# seaborn >= 0.13 deprecates `palette` without `hue` (removal announced for v0.14.0),
# so the category is also passed as `hue` and the redundant legend is switched off.
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    y=brand_counts.index,
    x=brand_counts.values,
    hue=brand_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Number of laptops per brand", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Brand", fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate every bar with its count (iterating all containers works with or without hue)
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_110005/2418909123.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=brand_counts.index, x=brand_counts.values, palette="viridis")
No description has been provided for this image

Analyzing price grouped by brand¶

InĀ [40]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Box plot of the price distribution for each brand.
# seaborn >= 0.13 deprecates `palette` without `hue` (removal announced for v0.14.0),
# so the grouping column is also passed as `hue` and the redundant legend is switched off.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=full_relation,
    y='laptop_specs_brand',
    x='laptop_specs_price',
    hue='laptop_specs_brand',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Brand", fontsize=16)
plt.ylabel("Brand", fontsize=14)
plt.xlabel("Price", fontsize=14)

plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_110005/2449895171.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, y='laptop_specs_brand', x='laptop_specs_price', palette="viridis")
No description has been provided for this image

Central Processing Unit (CPU)¶

Basic analysis¶

InĀ [41]:
# Mean laptop price and number of listings for every CPU model
cpu_price_groups = full_relation.groupby('laptop_specs_cpu')['laptop_specs_price']
mean_price_by_cpu = cpu_price_groups.agg(['mean', 'count'])
print("Number of unique CPUs:", len(mean_price_by_cpu), end='\n\n')

# Rank by mean price while the column is still numeric ...
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='mean', ascending=False)

# ... then turn the mean into a display string (thousands separators, 2 decimals, "đ" suffix)
mean_price_by_cpu['mean'] = mean_price_by_cpu['mean'].map(lambda price: f"{price:,.2f}đ")

# Most and least expensive CPUs
price_ranking = (
    ("Top 10 CPUs by Mean Price:", mean_price_by_cpu.head(10)),
    ("Bottom 10 CPUs by Mean Price:", mean_price_by_cpu.tail(10)),
)
for heading, rows in price_ranking:
    print(heading)
    print(rows, '\n\n')


# Re-rank by number of listings
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='count', ascending=False)

# Most and least common CPUs
count_ranking = (
    ("Top 10 CPUs by Count:", mean_price_by_cpu.head(10)),
    ("Bottom 10 CPUs by Count:", mean_price_by_cpu.tail(10)),
)
for heading, rows in count_ranking:
    print(heading)
    print(rows, '\n\n')
Number of unique CPUs: 125

Top 10 CPUs by Mean Price:
                                  mean  count
laptop_specs_cpu                             
apple m3 max 16 core   138,740,000.00đ      2
apple m2 max 12 core   105,990,000.00đ      1
apple m4 max 16 core   102,323,333.33đ      3
apple m4 max 14 core    87,240,000.00đ      4
intel core i9 13980hx   85,965,000.00đ      4
intel core i9 11900h    85,000,000.00đ      1
apple m3 max 14 core    82,490,000.00đ      5
intel core i7 13850hx   73,519,500.00đ      4
intel core i9 10885h    72,990,000.00đ      1
intel core i7 1365u     69,699,000.00đ      1 


Bottom 10 CPUs by Mean Price:
                               mean  count
laptop_specs_cpu                          
amd ryzen 5 5500u    12,994,500.00đ      2
intel core i3 1305u  12,767,636.36đ     11
amd ryzen 5 5625u    12,662,666.67đ      3
amd ryzen 7 5700u    12,491,928.57đ     14
intel core i3 1315u  12,069,240.00đ     25
amd ryzen 5 7520u    11,890,947.37đ     19
intel core i3 8145u  11,640,000.00đ      2
intel core i3 1215u  10,547,725.00đ     40
intel core i3 1220p   9,990,000.00đ      1
intel celeron n4500   8,490,000.00đ      2 


Top 10 CPUs by Count:
                                   mean  count
laptop_specs_cpu                              
intel core i5 1335u      18,226,192.86đ    140
intel core ultra 7 155h  35,580,125.00đ    128
intel core i5 1235u      15,344,267.24đ    116
intel core i7 1355u      22,554,482.46đ    114
intel core i7 13620h     26,433,737.86đ    103
intel core i5 13420h     18,852,182.93đ     82
intel core i5 12450h     17,496,858.97đ     78
apple m2 8 core          31,320,589.04đ     73
intel core ultra 5 125h  25,945,826.09đ     69
intel core i5 12500h     20,612,191.18đ     68 


Bottom 10 CPUs by Count:
                                mean  count
laptop_specs_cpu                           
amd ryzen 5 5600u     19,999,000.00đ      1
intel core i5 1345u   24,999,000.00đ      1
amd ryzen 5 3500u     15,790,000.00đ      1
amd ryzen 7 7735h     15,990,000.00đ      1
amd ryzen 5 4500u     17,990,000.00đ      1
amd ryzen 7 4800h     17,890,000.00đ      1
intel core i3 8130u   13,690,000.00đ      1
amd ryzen 5 2500u     14,139,000.00đ      1
intel core i5 10300h  14,990,000.00đ      1
intel core i3 1220p    9,990,000.00đ      1 


Analyzing CPU performance relation with price¶

InĀ [42]:
# Correlation of each CPU benchmark rating with the laptop price
laptop_price = full_relation['laptop_specs_price']
correlation_multithread_price = full_relation['cpu_specs_multithread_rating'].corr(laptop_price)
correlation_single_thread_price = full_relation['cpu_specs_single_thread_rating'].corr(laptop_price)

# Report both coefficients
print(f"Correlation between multithread_rating and price: {correlation_multithread_price:.2f}")
print(f"Correlation between single_thread_rating and price: {correlation_single_thread_price:.2f}")
Correlation between multithread_rating and price: 0.62
Correlation between single_thread_rating and price: 0.55
InĀ [43]:
# Plot style
sns.set_theme(style="whitegrid")

# Two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One panel per rating, left to right: (column, display label, colour)
rating_panels = (
    ('cpu_specs_single_thread_rating', 'Single Thread Rating', 'blue'),
    ('cpu_specs_multithread_rating', 'Multithread Rating', 'green'),
)

for panel_ax, (rating_column, rating_label, panel_colour) in zip(axes, rating_panels):
    # Scatter plus fitted regression line of price against the rating
    sns.regplot(data=full_relation, x=rating_column, y='laptop_specs_price', ax=panel_ax, color=panel_colour, scatter_kws={'s': 10})
    panel_ax.set_title(f"{rating_label} vs Price")
    panel_ax.set_xlabel(rating_label)
    panel_ax.set_ylabel('Price')

plt.tight_layout()
plt.show()
No description has been provided for this image

Graphics Processing Unit (GPU)¶

Basic analysis¶

InĀ [44]:
# Mean laptop price and number of listings for every GPU model (column 'laptop_specs_vga')
gpu_price_groups = full_relation.groupby('laptop_specs_vga')['laptop_specs_price']
mean_price_by_gpu = gpu_price_groups.agg(['mean', 'count'])
print("Number of unique GPUs:", len(mean_price_by_gpu), end='\n\n')

# Rank by mean price while the column is still numeric ...
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='mean', ascending=False)

# ... then turn the mean into a display string (thousands separators, 2 decimals, "đ" suffix)
mean_price_by_gpu['mean'] = mean_price_by_gpu['mean'].map(lambda price: f"{price:,.2f}đ")

# Most and least expensive GPUs
price_ranking = (
    ("Top 10 GPUs by Mean Price:", mean_price_by_gpu.head(10)),
    ("Bottom 10 GPUs by Mean Price:", mean_price_by_gpu.tail(10)),
)
for heading, rows in price_ranking:
    print(heading)
    print(rows, '\n\n')


# Re-rank by number of listings
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='count', ascending=False)

# Most and least common GPUs
count_ranking = (
    ("Top 10 GPUs by Count:", mean_price_by_gpu.head(10)),
    ("Bottom 10 GPUs by Count:", mean_price_by_gpu.tail(10)),
)
for heading, rows in count_ranking:
    print(heading)
    print(rows, '\n\n')
Number of unique GPUs: 23

Top 10 GPUs by Mean Price:
                                    mean  count
laptop_specs_vga                               
geforce rtx 4090         103,615,000.00đ      8
geforce rtx 4080          81,046,250.00đ     16
rtx 2000 ada generation   76,385,000.00đ     11
rtx a1000                 58,195,428.57đ      7
geforce rtx 2060          55,990,000.00đ      1
geforce gtx 1650 ti       51,490,000.00đ      1
geforce rtx 4070          50,061,304.35đ     69
rtx a500                  49,122,428.57đ      7
rtx 500 ada generation    48,240,000.00đ      2
geforce rtx 4060          34,202,455.45đ    202 


Bottom 10 GPUs by Mean Price:
                               mean  count
laptop_specs_vga                          
geforce rtx 3050 ti  26,873,333.33đ     18
geforce mx570        26,062,666.67đ      3
radeon rx 7600s      23,490,000.00đ      1
geforce mx450        22,994,500.00đ      2
geforce rtx 3050     22,916,762.82đ    156
geforce mx550        20,529,187.50đ     16
geforce rtx 2050     18,445,879.63đ    108
geforce mx350        17,990,000.00đ      1
geforce gtx 1650     16,215,000.00đ      4
radeon rx 6550m      15,123,333.33đ      3 


Top 10 GPUs by Count:
                                   mean  count
laptop_specs_vga                              
geforce rtx 4060         34,202,455.45đ    202
geforce rtx 4050         28,064,038.04đ    184
geforce rtx 3050         22,916,762.82đ    156
geforce rtx 2050         18,445,879.63đ    108
geforce rtx 4070         50,061,304.35đ     69
geforce rtx 3050 ti      26,873,333.33đ     18
geforce rtx 4080         81,046,250.00đ     16
geforce mx550            20,529,187.50đ     16
geforce rtx 3060         33,245,307.69đ     13
rtx 2000 ada generation  76,385,000.00đ     11 


Bottom 10 GPUs by Count:
                                  mean  count
laptop_specs_vga                             
geforce gtx 1650        16,215,000.00đ      4
radeon rx 6550m         15,123,333.33đ      3
geforce rtx 3070 ti     33,490,000.00đ      3
geforce mx570           26,062,666.67đ      3
rtx 500 ada generation  48,240,000.00đ      2
geforce mx450           22,994,500.00đ      2
geforce gtx 1650 ti     51,490,000.00đ      1
geforce rtx 2060        55,990,000.00đ      1
radeon rx 7600s         23,490,000.00đ      1
geforce mx350           17,990,000.00đ      1 


Analyzing GPU performance relation with price¶

InĀ [45]:
# Correlation (pandas default method) between the avg_g3d_mark column and the laptop price
g3d_mark_column = full_relation['gpu_specs_avg_g3d_mark']
correlation_avg_g3d_mark_price = g3d_mark_column.corr(full_relation['laptop_specs_price'])

# Report the coefficient rounded to two decimals
print(f"Correlation between avg_g3d_mark and price: {correlation_avg_g3d_mark_price:.2f}")
Correlation between avg_g3d_mark and price: 0.64
InĀ [46]:
# Use the whitegrid seaborn theme for this figure
sns.set_theme(style="whitegrid")

# Scatter of price against avg G3D mark with a fitted regression line,
# drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=full_relation,
    x='gpu_specs_avg_g3d_mark',
    y='laptop_specs_price',
    color='blue',
    scatter_kws={'s': 10},
    ax=ax,
)

# Title, axis labels and grid
ax.set_title("Correlation between Avg G3D Mark and Price", fontsize=16)
ax.set_xlabel("Avg G3D Mark", fontsize=14)
ax.set_ylabel("Price", fontsize=14)
ax.grid(True)

# Render the figure
fig.tight_layout()
plt.show()
No description has been provided for this image

Random Access Memory (RAM)¶

Basic analysis¶

InĀ [47]:
# Frequency tables for the two RAM columns (value_counts sorts by descending count)
ram_amount_counts = full_relation['laptop_specs_ram_amount'].value_counts()
ram_type_counts = full_relation['laptop_specs_ram_type'].value_counts()

# Show each table under its heading
print("Unique RAM amounts and their counts:", ram_amount_counts, sep="\n")
print("\nUnique RAM types and their counts:", ram_type_counts, sep="\n")
Unique RAM amounts and their counts:
laptop_specs_ram_amount
16.0     1313
8.0       567
32.0      219
24.0       40
12.0       35
4.0        21
36.0       15
64.0       11
512.0       8
48.0        5
18.0        4
96.0        1
128.0       1
Name: count, dtype: int64

Unique RAM types and their counts:
laptop_specs_ram_type
ddr5    1174
ddr4     875
Name: count, dtype: int64
InĀ [48]:
# Convert RAM amount to a categorical column. The RAM correlation and box-plot cells
# read this column afterwards, so the in-place conversion is kept.
full_relation['laptop_specs_ram_amount'] = pd.Categorical(full_relation['laptop_specs_ram_amount'])

# Tick labels: RAM sizes as integer strings, in the count-sorted order of ram_amount_counts
ram_amount_labels = ram_amount_counts.index.astype(int).astype(str)

# Vertical bar chart: RAM amount on the x-axis, number of laptops on the y-axis.
# `hue` mirrors `x` with the legend disabled because seaborn deprecates passing
# `palette` without `hue`; the per-bar colouring stays the same.
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=ram_amount_labels,
    y=ram_amount_counts.values,
    hue=ram_amount_labels,
    palette="viridis",
    legend=False,
)
plt.title("Number of Laptops by RAM Amount", fontsize=16)
# x carries the RAM amounts and y the counts, so the labels must follow that orientation
plt.xlabel("RAM Amount (GB)", fontsize=14)
plt.ylabel("Count", fontsize=14)
# Dashed guide lines along the count axis
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels (with `hue` set there may be one container per bar)
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_110005/3967087379.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x=ram_amount_counts.index.astype(int).astype(str), y=ram_amount_counts.values, palette="viridis")
No description has been provided for this image
InĀ [49]:
# Ring-style pie chart (wedge width 0.3) of the RAM type shares
plt.figure(figsize=(8, 8))
ram_type_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff','#99ff99'],
    labels=ram_type_counts.index,
    wedgeprops=dict(width=0.3),
)

# Title for the chart
plt.title("Distribution of RAM Types", fontsize=16)

# Render the figure
plt.show()
No description has been provided for this image

Analyzing RAM performance relation with price¶

InĀ [50]:
# The RAM amount column may be categorical at this point, so cast it to float before correlating
ram_amount_numeric = full_relation['laptop_specs_ram_amount'].astype(float)
correlation_ram_price = ram_amount_numeric.corr(full_relation['laptop_specs_price'])

# Report the coefficient rounded to two decimals
print(f"Correlation between RAM amount and price: {correlation_ram_price:.2f}")
Correlation between RAM amount and price: 0.25
InĀ [51]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Box plot of laptop price for each RAM amount.
# `hue` mirrors `x` with the legend disabled: seaborn deprecates passing `palette`
# without `hue`, and this form keeps one colour per RAM amount.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=full_relation,
    x='laptop_specs_ram_amount',
    y='laptop_specs_price',
    hue='laptop_specs_ram_amount',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by RAM Amount", fontsize=16)
plt.xlabel("RAM Amount (GB)", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_110005/1837874617.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, x='laptop_specs_ram_amount', y='laptop_specs_price', palette="viridis")
No description has been provided for this image
InĀ [52]:
# Use the whitegrid seaborn theme for this figure
sns.set_theme(style="whitegrid")

# Filled density curves of price, one per RAM type, on an explicitly created axes
fig, ax = plt.subplots(figsize=(14, 8))
sns.kdeplot(
    data=full_relation,
    x='laptop_specs_price',
    hue='laptop_specs_ram_type',
    fill=True,
    palette="viridis",
    ax=ax,
)

# Title, axis labels and grid
ax.set_title("Price Distribution by RAM Type", fontsize=16)
ax.set_xlabel("Price", fontsize=14)
ax.set_ylabel("Density", fontsize=14)
ax.grid(True)

# Render the figure
fig.tight_layout()
plt.show()
No description has been provided for this image

Storage¶

Basic analysis¶

InĀ [53]:
# Work on a copy so the conversion and filtering below leave full_relation untouched
full_relation_clone = full_relation.copy()

# Coerce the storage amount to numbers; values that cannot be parsed become NaN
storage_amount_numeric = pd.to_numeric(full_relation_clone['laptop_specs_storage_amount'], errors='coerce')
full_relation_clone['laptop_specs_storage_amount'] = storage_amount_numeric

# Keep only rows whose storage amount is at least 128 (NaN rows fail the comparison and drop out)
has_min_storage = full_relation_clone['laptop_specs_storage_amount'] >= 128
full_relation_clone = full_relation_clone[has_min_storage]

# Frequency tables for storage amount and storage type on the filtered rows
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()
storage_type_counts = full_relation_clone['laptop_specs_storage_type'].value_counts()

# Show each table under its heading
print("Unique storage amounts and their counts:", storage_amount_counts, sep="\n")
print("\nUnique storage types and their counts:", storage_type_counts, sep="\n")
Unique storage amounts and their counts:
laptop_specs_storage_amount
512.0     1423
1024.0     387
256.0      126
2048.0      21
8192.0       1
Name: count, dtype: int64

Unique storage types and their counts:
laptop_specs_storage_type
ssd    1897
hdd       4
Name: count, dtype: int64
InĀ [54]:
# Convert storage amount to categorical type (on the filtered clone, not on full_relation)
full_relation_clone['laptop_specs_storage_amount'] = pd.Categorical(full_relation_clone['laptop_specs_storage_amount'])

# Recompute the counts from the categorical column and build integer-string tick labels
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()
storage_amount_labels = storage_amount_counts.index.astype(int).astype(str)

# Vertical bar chart: storage amount on the x-axis, number of laptops on the y-axis.
# `hue` mirrors `x` with the legend disabled because seaborn deprecates passing
# `palette` without `hue`; the per-bar colouring stays the same.
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=storage_amount_labels,
    y=storage_amount_counts.values,
    hue=storage_amount_labels,
    palette="viridis",
    legend=False,
)
plt.title("Number of Laptops by Storage Amount", fontsize=16)
plt.xlabel("Storage Amount (GB)", fontsize=14)
plt.ylabel("Count", fontsize=14)
# Dashed guide lines along the count axis (the bars are vertical)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels (with `hue` set there may be one container per bar)
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_110005/4095148162.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x=storage_amount_counts.index.astype(int).astype(str), y=storage_amount_counts.values, palette="viridis")
No description has been provided for this image

Analyzing Storage relation with price¶

InĀ [55]:
# The storage amount column may be categorical at this point, so cast it to float before correlating
storage_amount_as_float = full_relation_clone['laptop_specs_storage_amount'].astype(float)
correlation_storage_price = storage_amount_as_float.corr(full_relation_clone['laptop_specs_price'])

# Report the coefficient rounded to two decimals
print(f"Correlation between storage amount and price: {correlation_storage_price:.2f}")
Correlation between storage amount and price: 0.60
InĀ [56]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Box plot of laptop price for each storage amount, using the filtered storage clone.
# `hue` mirrors `x` with the legend disabled: seaborn deprecates passing `palette`
# without `hue`, and this form keeps one colour per storage amount.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=full_relation_clone,
    x='laptop_specs_storage_amount',
    y='laptop_specs_price',
    hue='laptop_specs_storage_amount',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Storage Amount", fontsize=16)
plt.xlabel("Storage Amount (GB)", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_110005/4036590408.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation_clone, x='laptop_specs_storage_amount', y='laptop_specs_price', palette="viridis")
No description has been provided for this image

Screen Features¶

Basic analysis¶

InĀ [57]:
# describe() summaries for the three numeric screen columns
screen_size_stats = full_relation['laptop_specs_screen_size'].describe()
refresh_rate_stats = full_relation['laptop_specs_screen_refresh_rate'].describe()
brightness_stats = full_relation['laptop_specs_screen_brightness'].describe()

# Print every summary under its heading
for stats_heading, stats_table in (
    ("Summary Statistics for Screen Size:", screen_size_stats),
    ("\nSummary Statistics for Screen Refresh Rate:", refresh_rate_stats),
    ("\nSummary Statistics for Screen Brightness:", brightness_stats),
):
    print(stats_heading)
    print(stats_table)
Summary Statistics for Screen Size:
count    2224.000000
mean       15.054362
std         0.962808
min        11.600000
25%        14.000000
50%        15.600000
75%        15.600000
max        18.000000
Name: laptop_specs_screen_size, dtype: float64

Summary Statistics for Screen Refresh Rate:
count    1400.000000
mean      120.255000
std        50.331226
min        60.000000
25%        60.000000
50%       120.000000
75%       144.000000
max       480.000000
Name: laptop_specs_screen_refresh_rate, dtype: float64

Summary Statistics for Screen Brightness:
count    1288.000000
mean      348.354037
std       121.155601
min       220.000000
25%       250.000000
50%       300.000000
75%       400.000000
max      1200.000000
Name: laptop_specs_screen_brightness, dtype: float64
InĀ [58]:
# Print unique values and their counts for screen resolution
screen_resolution_counts = full_relation['laptop_specs_screen_resolution'].value_counts()
print("Unique screen resolutions and their counts:")
print(screen_resolution_counts)

# Horizontal bar chart: one bar per resolution, bar length = number of laptops.
# `hue` mirrors `y` with the legend disabled because seaborn deprecates passing
# `palette` without `hue`; the per-bar colouring stays the same.
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    y=screen_resolution_counts.index,
    x=screen_resolution_counts.values,
    hue=screen_resolution_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Number of Laptops by Screen Resolution", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Screen Resolution", fontsize=14)
# Dashed guide lines along the count axis
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels (with `hue` set there may be one container per bar)
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
Unique screen resolutions and their counts:
laptop_specs_screen_resolution
1920x1080    1142
1920x1200     376
2560x1600     212
2880x1800     178
3024x1964      39
2560x1664      38
2880x1620      20
3200x2000      20
2880x1920      16
2880x1864      15
2560x1440      14
3456x2234      13
3840x2400      11
3072x1920      10
2560x1644      10
2048x1280       9
2240x1400       8
1366x768        5
3456x2160       3
2800x1800       2
2800x1620       1
1980x1080       1
2160x1440       1
3201x2000       1
2960x1848       1
2220x1080       1
2256x1504       1
3000x2000       1
Name: count, dtype: int64
/tmp/ipykernel_110005/3570925166.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=screen_resolution_counts.index, x=screen_resolution_counts.values, palette="viridis")
No description has been provided for this image

Analysis of screen features with price¶

InĀ [59]:
# Clone the full_relation DataFrame so the filtering below does not touch the original
full_relation_clone = full_relation.copy()

# Get the counts of each screen resolution
screen_resolution_counts = full_relation_clone['laptop_specs_screen_resolution'].value_counts()

# Keep only resolutions that occur at least 20 times
# (presumably so that every box is drawn from a reasonable number of laptops)
filtered_screen_resolutions = screen_resolution_counts[screen_resolution_counts >= 20].index

# Filter the DataFrame down to those resolutions
full_relation_clone = full_relation_clone[full_relation_clone['laptop_specs_screen_resolution'].isin(filtered_screen_resolutions)]

# Set the plot style
sns.set_theme(style="whitegrid")

# Sort by resolution string so the boxes appear in a stable order
full_relation_clone = full_relation_clone.sort_values(by='laptop_specs_screen_resolution')

# Horizontal box plot of price per resolution.
# `hue` mirrors `y` with the legend disabled: seaborn deprecates passing `palette`
# without `hue`, and this form keeps one colour per resolution.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=full_relation_clone,
    y='laptop_specs_screen_resolution',
    x='laptop_specs_price',
    hue='laptop_specs_screen_resolution',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Screen Resolution", fontsize=16)
plt.ylabel("Screen Resolution", fontsize=14)
plt.xlabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_110005/1749291397.py:21: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation_clone, y='laptop_specs_screen_resolution', x='laptop_specs_price', palette="viridis")
No description has been provided for this image
InĀ [60]:
# Correlation of each numeric screen feature with the laptop price
screen_price_column = full_relation['laptop_specs_price']
correlation_screen_size_price = full_relation['laptop_specs_screen_size'].corr(screen_price_column)
correlation_refresh_rate_price = full_relation['laptop_specs_screen_refresh_rate'].corr(screen_price_column)
correlation_brightness_price = full_relation['laptop_specs_screen_brightness'].corr(screen_price_column)

print(f"Correlation between screen size and price: {correlation_screen_size_price:.2f}")
print(f"Correlation between screen refresh rate and price: {correlation_refresh_rate_price:.2f}")
print(f"Correlation between screen brightness and price: {correlation_brightness_price:.2f}")

# Use the whitegrid seaborn theme for this figure
sns.set_theme(style="whitegrid")

# One panel per screen feature, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (column, point colour, panel title, x-axis label) for each panel, left to right
screen_panels = (
    ('laptop_specs_screen_size', 'blue', 'Price vs. Screen Size', 'Screen Size (inches)'),
    ('laptop_specs_screen_refresh_rate', 'green', 'Price vs. Screen Refresh Rate', 'Screen Refresh Rate (Hz)'),
    ('laptop_specs_screen_brightness', 'red', 'Price vs. Screen Brightness', 'Screen Brightness (nits)'),
)

# Scatter with regression line of price against each feature
for panel_ax, (feature_column, point_color, panel_title, x_label) in zip(axes, screen_panels):
    sns.regplot(
        data=full_relation,
        x=feature_column,
        y='laptop_specs_price',
        ax=panel_ax,
        color=point_color,
        scatter_kws={'alpha':0.7},
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Price (VND)')

plt.tight_layout()
plt.show()
Correlation between screen size and price: 0.14
Correlation between screen refresh rate and price: 0.34
Correlation between screen brightness and price: 0.46
No description has been provided for this image

Portability Features¶

Weight¶

Basic analysis

InĀ [61]:
# describe() summary of the laptop weight column, printed under a heading
weight_stats = full_relation['laptop_specs_weight'].describe()
print("Summary Statistics for Weight:", weight_stats, sep="\n")
Summary Statistics for Weight:
count    1853.000000
mean        1.801285
std         0.450023
min         0.879000
25%         1.410000
50%         1.700000
75%         2.200000
max         4.000000
Name: laptop_specs_weight, dtype: float64
InĀ [62]:
# Histogram (30 bins) with a KDE overlay of the non-missing laptop weights
laptop_weights = full_relation['laptop_specs_weight'].dropna()
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(laptop_weights, kde=True, color='blue', bins=30, ax=ax)

# Title and axis labels
ax.set_title("Distribution of Laptop Weights", fontsize=16)
ax.set_xlabel("Weight (kg)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image

Analysis of weight with price

InĀ [63]:
# Correlation (pandas default method) between laptop weight and price
weight_column = full_relation['laptop_specs_weight']
correlation_weight_price = weight_column.corr(full_relation['laptop_specs_price'])

# Report the coefficient rounded to two decimals
print(f"Correlation between weight and price: {correlation_weight_price:.2f}")
Correlation between weight and price: 0.26
InĀ [64]:
# Use the whitegrid seaborn theme for this figure
sns.set_theme(style="whitegrid")

# Scatter of price against weight with a fitted regression line,
# drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=full_relation,
    x='laptop_specs_weight',
    y='laptop_specs_price',
    color='blue',
    scatter_kws={'s': 10},
    ax=ax,
)

# Title, axis labels and grid
ax.set_title("Weight vs Price", fontsize=16)
ax.set_xlabel("Weight (kg)", fontsize=14)
ax.set_ylabel("Price (VND)", fontsize=14)
ax.grid(True)

# Render the figure
fig.tight_layout()
plt.show()
No description has been provided for this image

Length, Width, Height¶

Basic analysis

InĀ [65]:
# Summary statistics for the three physical dimension columns.
# NOTE: the variable names are kept as they were so any other cell that references them
# keeps working: `length_stats` holds laptop_specs_height and `height_stats` holds
# laptop_specs_depth.
length_stats = full_relation['laptop_specs_height'].describe()
width_stats = full_relation['laptop_specs_width'].describe()
height_stats = full_relation['laptop_specs_depth'].describe()

# Label each table with the column it actually describes
# (previously the height column was printed as "Length" and the depth column as "Height")
print("Summary Statistics for Height:")
print(length_stats)

print("\nSummary Statistics for Width:")
print(width_stats)

print("\nSummary Statistics for Depth:")
print(height_stats)
Summary Statistics for Length:
count    1657.000000
mean        1.947435
std         0.646176
min         0.670000
25%         1.690000
50%         1.890000
75%         2.170000
max        22.700000
Name: laptop_specs_height, dtype: float64

Summary Statistics for Width:
count    1658.00000
mean       34.20111
std         2.47994
min         2.10000
25%        31.71000
50%        35.62000
75%        35.96000
max        52.30000
Name: laptop_specs_width, dtype: float64

Summary Statistics for Height:
count    1658.000000
mean       23.622322
std         1.933157
min         1.000000
25%        22.120000
50%        23.530000
75%        25.100000
max        32.000000
Name: laptop_specs_depth, dtype: float64
InĀ [66]:
# Set the plot style
sns.set_theme(style="whitegrid")

# One histogram panel per dimension column, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (column, bar colour, dimension name) for each panel, left to right.
# Each name matches the column being plotted: laptop_specs_height is the height and
# laptop_specs_depth is the depth (they were previously titled "Length" and "Height").
dimension_panels = (
    ('laptop_specs_height', 'blue', 'Height'),
    ('laptop_specs_width', 'green', 'Width'),
    ('laptop_specs_depth', 'red', 'Depth'),
)

# Histogram (30 bins) with KDE overlay of the non-missing values of each dimension
for panel_ax, (dimension_column, bar_color, dimension_name) in zip(axes, dimension_panels):
    sns.histplot(full_relation[dimension_column].dropna(), kde=True, color=bar_color, bins=30, ax=panel_ax)
    panel_ax.set_title(f"Distribution of Laptop {dimension_name}", fontsize=16)
    panel_ax.set_xlabel(f"{dimension_name} (cm)", fontsize=14)
    panel_ax.set_ylabel("Frequency", fontsize=14)

plt.tight_layout()
plt.show()
No description has been provided for this image

Analysis of dimensions with price

InĀ [67]:
# Correlation of each physical dimension column with the laptop price.
# NOTE: the variable names are kept as they were so any other cell that references them
# keeps working: `correlation_length_price` is computed from laptop_specs_height and
# `correlation_height_price` from laptop_specs_depth.
correlation_length_price = full_relation['laptop_specs_height'].corr(full_relation['laptop_specs_price'])
correlation_width_price = full_relation['laptop_specs_width'].corr(full_relation['laptop_specs_price'])
correlation_height_price = full_relation['laptop_specs_depth'].corr(full_relation['laptop_specs_price'])

# Label each coefficient with the column it was actually computed from
# (previously the height column was reported as "length" and the depth column as "height")
print(f"Correlation between height and price: {correlation_length_price:.2f}")
print(f"Correlation between width and price: {correlation_width_price:.2f}")
print(f"Correlation between depth and price: {correlation_height_price:.2f}")
Correlation between length and price: 0.06
Correlation between width and price: -0.06
Correlation between height and price: 0.17
InĀ [68]:
# Set the plot style
sns.set_theme(style="whitegrid")

# One regression panel per dimension column, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (column, point colour, dimension name) for each panel, left to right.
# Each name matches the column being plotted: laptop_specs_height is the height and
# laptop_specs_depth is the depth (they were previously titled "Length" and "Height").
dimension_panels = (
    ('laptop_specs_height', 'blue', 'Height'),
    ('laptop_specs_width', 'green', 'Width'),
    ('laptop_specs_depth', 'red', 'Depth'),
)

# Scatter with regression line of price against each dimension
for panel_ax, (dimension_column, point_color, dimension_name) in zip(axes, dimension_panels):
    sns.regplot(
        data=full_relation,
        x=dimension_column,
        y='laptop_specs_price',
        ax=panel_ax,
        color=point_color,
        scatter_kws={'s': 10},
    )
    panel_ax.set_title(f'{dimension_name} vs Price')
    panel_ax.set_xlabel(f'{dimension_name} (cm)')
    panel_ax.set_ylabel('Price (VND)')

plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [69]:
# Start from a fresh copy of full_relation. The full_relation_clone left over from the
# screen-resolution box plot was filtered to resolutions with at least 20 laptops and
# re-sorted, so reusing it would compute the volume statistics on an unrelated subset.
full_relation_clone = full_relation.copy()

# Volume as the product of the three dimension columns (NaN if any dimension is missing)
full_relation_clone['volume'] = (
    full_relation_clone['laptop_specs_height']
    * full_relation_clone['laptop_specs_width']
    * full_relation_clone['laptop_specs_depth']
)

# Calculate the correlation between volume and price
correlation_volume_price = full_relation_clone['volume'].corr(full_relation_clone['laptop_specs_price'])

# Print the correlation result
print(f"Correlation between volume and price: {correlation_volume_price:.2f}")

# Scatter of price against volume with a fitted regression line
plt.figure(figsize=(10, 6))
sns.regplot(data=full_relation_clone, x='volume', y='laptop_specs_price', color='blue', scatter_kws={'s': 10})

# Add titles and labels
plt.title("Volume vs Price", fontsize=16)
plt.xlabel("Volume (cm³)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
Correlation between volume and price: 0.07
No description has been provided for this image

Battery and Power¶

Basic Analysis¶

InĀ [70]:
# describe() summaries for the battery capacity and battery cell-count columns
battery_amount_stats = full_relation['laptop_specs_battery_capacity'].describe()
battery_cells_stats = full_relation['laptop_specs_battery_cells'].describe()

# Print every summary under its heading
for stats_heading, stats_table in (
    ("Summary Statistics for Battery Capacity:", battery_amount_stats),
    ("\nSummary Statistics for Battery Cells:", battery_cells_stats),
):
    print(stats_heading)
    print(stats_table)
Summary Statistics for Battery Capacity:
count    1892.000000
mean       59.638541
std        23.344431
min        30.000000
25%        47.000000
50%        56.000000
75%        70.000000
max       800.000000
Name: laptop_specs_battery_capacity, dtype: float64

Summary Statistics for Battery Cells:
count    1466.000000
mean        3.491132
std         0.663214
min         2.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         6.000000
Name: laptop_specs_battery_cells, dtype: float64
InĀ [71]:
# Histogram (30 bins) with a KDE overlay of the non-missing battery capacities
battery_capacities = full_relation['laptop_specs_battery_capacity'].dropna()
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(battery_capacities, kde=True, color='blue', bins=30, ax=ax)

# Title and axis labels
ax.set_title("Distribution of Laptop Battery Capacity", fontsize=16)
ax.set_xlabel("Battery Capacity (Wh)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image

Analysis of battery and power features with price

InĀ [72]:
# Correlation of battery capacity and battery cell count with the laptop price
battery_price_column = full_relation['laptop_specs_price']
correlation_battery_capacity_price = full_relation['laptop_specs_battery_capacity'].corr(battery_price_column)
correlation_battery_cells_price = full_relation['laptop_specs_battery_cells'].corr(battery_price_column)

# Report both coefficients rounded to two decimals
print(f"Correlation between battery capacity and price: {correlation_battery_capacity_price:.2f}")
print(f"Correlation between battery cells and price: {correlation_battery_cells_price:.2f}")
Correlation between battery capacity and price: 0.47
Correlation between battery cells and price: 0.55
InĀ [73]:
# Use the whitegrid seaborn theme for this figure
sns.set_theme(style="whitegrid")

# Scatter of price against battery capacity with a fitted regression line,
# drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=full_relation,
    x='laptop_specs_battery_capacity',
    y='laptop_specs_price',
    color='blue',
    scatter_kws={'s': 10},
    ax=ax,
)

# Title, axis labels and grid
ax.set_title("Battery Capacity vs Price", fontsize=16)
ax.set_xlabel("Battery Capacity (Wh)", fontsize=14)
ax.set_ylabel("Price (VND)", fontsize=14)
ax.grid(True)

# Render the figure
fig.tight_layout()
plt.show()
No description has been provided for this image

Connectivity Features¶

Basic analysis¶

InĀ [74]:
# Frequency tables for every connectivity column (value_counts sorts by descending count)
usb_a_counts = full_relation['laptop_specs_number_usb_a_ports'].value_counts()
usb_c_counts = full_relation['laptop_specs_number_usb_c_ports'].value_counts()
hdmi_counts = full_relation['laptop_specs_number_hdmi_ports'].value_counts()
ethernet_counts = full_relation['laptop_specs_number_ethernet_ports'].value_counts()
audio_jack_counts = full_relation['laptop_specs_number_audio_jacks'].value_counts()

# Print every table under its heading, in the same order as they were computed
for counts_heading, counts_table in (
    ("Unique values and counts for number of USB-A ports:", usb_a_counts),
    ("\nUnique values and counts for number of USB-C ports:", usb_c_counts),
    ("\nUnique values and counts for number of HDMI ports:", hdmi_counts),
    ("\nUnique values and counts for number of Ethernet ports:", ethernet_counts),
    ("\nUnique values and counts for number of audio jacks:", audio_jack_counts),
):
    print(counts_heading)
    print(counts_table)
Unique values and counts for number of USB-A ports:
laptop_specs_number_usb_a_ports
0.0     906
2.0     545
3.0     347
1.0     177
4.0      53
6.0       4
5.0       3
12.0      3
8.0       2
Name: count, dtype: int64

Unique values and counts for number of USB-C ports:
laptop_specs_number_usb_c_ports
1.0    857
2.0    540
0.0    503
3.0     61
4.0     51
8.0     22
5.0      6
Name: count, dtype: int64

Unique values and counts for number of HDMI ports:
laptop_specs_number_hdmi_ports
1.0    1788
0.0     252
Name: count, dtype: int64

Unique values and counts for number of Ethernet ports:
laptop_specs_number_ethernet_ports
0.0    1639
1.0     401
Name: count, dtype: int64

Unique values and counts for number of audio jacks:
laptop_specs_number_audio_jacks
0.0    1082
1.0     958
Name: count, dtype: int64
InĀ [75]:
# Pie charts of the port-count distributions, one panel per connector type.
# plt and sns are already imported in the notebook's first cell, so they are
# not re-imported here.

# (label, value-count Series) for each panel, in display order. The label is
# used both in the panel title and as the legend title.
port_panels = [
    ("USB-A Ports", usb_a_counts),
    ("USB-C Ports", usb_c_counts),
    ("HDMI Ports", hdmi_counts),
    ("Ethernet Ports", ethernet_counts),
    ("Audio Jacks", audio_jack_counts),
]

# 2 x 3 grid, flattened so panels can be addressed by a single index.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for ax, (panel_label, counts) in zip(axes, port_panels):
    counts.plot(
        kind='pie',
        autopct='%1.1f%%',
        startangle=140,
        ax=ax,
        colors=sns.color_palette('pastel', len(counts)),
        labels=None  # No wedge labels; the legend identifies each slice
    )
    ax.set_title(f"Distribution of {panel_label}")
    ax.set_ylabel('')  # Drop the automatic y-label pandas adds to pie charts
    ax.legend(counts.index, title=panel_label, loc="best")

# The grid has more slots than panels; remove whatever is left unused.
for unused_ax in axes[len(port_panels):]:
    fig.delaxes(unused_ax)

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image

Analysis of connectivity vs. price¶

InĀ [76]:
# Correlation (Series.corr default method) of each connectivity count with price.
price_series = full_relation['laptop_specs_price']

correlation_usb_a_price = full_relation['laptop_specs_number_usb_a_ports'].corr(price_series)
correlation_usb_c_price = full_relation['laptop_specs_number_usb_c_ports'].corr(price_series)
correlation_hdmi_price = full_relation['laptop_specs_number_hdmi_ports'].corr(price_series)
correlation_ethernet_price = full_relation['laptop_specs_number_ethernet_ports'].corr(price_series)
correlation_audio_jack_price = full_relation['laptop_specs_number_audio_jacks'].corr(price_series)

# Report all five coefficients, one per line, rounded to two decimals.
print(
    f"Correlation between number of USB-A ports and price: {correlation_usb_a_price:.2f}",
    f"Correlation between number of USB-C ports and price: {correlation_usb_c_price:.2f}",
    f"Correlation between number of HDMI ports and price: {correlation_hdmi_price:.2f}",
    f"Correlation between number of Ethernet ports and price: {correlation_ethernet_price:.2f}",
    f"Correlation between number of audio jacks and price: {correlation_audio_jack_price:.2f}",
    sep="\n",
)
Correlation between number of USB-A ports and price: -0.05
Correlation between number of USB-C ports and price: 0.07
Correlation between number of HDMI ports and price: -0.12
Correlation between number of Ethernet ports and price: -0.04
Correlation between number of audio jacks and price: 0.06

Software Features¶

Default OS¶

Basic analysis

InĀ [77]:
# Normalise the default-OS labels, then tally them.
# Any label containing 'window' (case-insensitive) is collapsed into the single
# category 'windows'; every other value, including missing ones, is left as is.
os_column = 'laptop_specs_default_os'

# The .str accessor yields a missing result for None/NaN/non-string entries
# instead of raising, and na=False maps those to "no match". This replaces a
# per-row lambda that only guarded against None.
is_windows = (
    full_relation[os_column]
    .str.lower()
    .str.contains('window', regex=False, na=False)
)
# Note: this rewrites full_relation in place; re-running the cell is harmless
# because 'windows' itself matches and is mapped to 'windows' again.
full_relation.loc[is_windows, os_column] = 'windows'

# Print the updated unique OS and their counts
os_counts = full_relation[os_column].value_counts()
print("Unique OS and their counts:")
print(os_counts)
Unique OS and their counts:
laptop_specs_default_os
windows      1949
macos         166
linux          19
chrome os       2
Name: count, dtype: int64
InĀ [78]:
# Donut chart showing the share of laptops per default OS.
fig, ax = plt.subplots(figsize=(8, 8))

donut_options = dict(
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#99ff99', '#ffcc99', '#ff9999'],
    labels=None,                 # no wedge labels; the legend names the slices
    wedgeprops=dict(width=0.3),  # ring width < 1 turns the pie into a donut
    textprops={'fontsize': 10},  # size of the percentage text
)
os_counts.plot(kind='pie', ax=ax, **donut_options)

# Legend entries follow the order of os_counts.
ax.legend(os_counts.index, loc="best")
ax.set_title("Distribution of Default OS", fontsize=16)

plt.show()
No description has been provided for this image

Warranty¶

InĀ [79]:
# Frequency of each distinct warranty value (missing values are not counted
# by value_counts' default settings).
warranty_counts = full_relation['laptop_specs_warranty'].value_counts()
print("Unique warranty values and their counts:", warranty_counts, sep="\n")
Unique warranty values and their counts:
laptop_specs_warranty
12.0    1017
24.0     829
36.0      88
18.0       1
Name: count, dtype: int64
InĀ [80]:
# Relationship between warranty length and price: correlation coefficient
# plus a boxplot of price for each warranty value.
correlation_warranty_price = full_relation['laptop_specs_warranty'].corr(full_relation['laptop_specs_price'])
print(f"Correlation between warranty and price: {correlation_warranty_price:.2f}")

# Set the plot style
sns.set_theme(style="whitegrid")

# Create a boxplot for price distribution by warranty.
# seaborn deprecates passing `palette` without `hue` (removal announced for
# v0.14.0); assigning the x variable to `hue` with legend=False is the
# replacement the deprecation message itself prescribes.
# NOTE(review): with an explicit numeric hue the box colours may be taken from
# the continuous viridis scale rather than evenly spaced steps — confirm the
# rendered colours are acceptable.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=full_relation,
    x='laptop_specs_warranty',
    y='laptop_specs_price',
    hue='laptop_specs_warranty',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Warranty", fontsize=16)
plt.xlabel("Warranty (months)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
Correlation between warranty and price: 0.11
/tmp/ipykernel_110005/2459959301.py:10: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, x='laptop_specs_warranty', y='laptop_specs_price', palette="viridis")
No description has been provided for this image

Target Feature: price¶

Basic statistics

InĀ [81]:
# Summary statistics (as produced by Series.describe) for the target column.
price_stats = full_relation['laptop_specs_price'].describe()

# Heading and statistics table, each on its own line.
print("Basic Statistics for Price:", price_stats, sep="\n")
Basic Statistics for Price:
count    2.242000e+03
mean     2.705181e+07
std      1.520890e+07
min      7.990000e+06
25%      1.759900e+07
50%      2.299000e+07
75%      3.114475e+07
max      1.824900e+08
Name: laptop_specs_price, dtype: float64

Visualizing the distribution

InĀ [82]:
# Histogram (30 bins) of laptop prices with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    full_relation['laptop_specs_price'],
    kde=True,
    color='blue',
    bins=30,
    ax=ax,
)

# Title and axis labels
ax.set_title("Distribution of Laptop Prices", fontsize=16)
ax.set_xlabel("Price (VND)", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)

plt.show()
No description has been provided for this image
InĀ [83]:
# Horizontal boxplot of all laptop prices, to show spread and outliers.
plt.figure(figsize=(12, 6))

# There is no grouping variable here, so a palette has nothing to map onto;
# seaborn deprecates `palette` without `hue` (removal announced for v0.14.0).
# Use a single colour sampled from viridis instead, which keeps the viridis
# look without the deprecated argument combination.
box_color = sns.color_palette("viridis", 1)[0]
sns.boxplot(data=full_relation, x='laptop_specs_price', color=box_color)

# Add titles and labels
plt.title("Boxplot of Laptop Prices", fontsize=16)
plt.xlabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_110005/2849463995.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, x='laptop_specs_price', palette="viridis")
No description has been provided for this image